import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline


def model_train():
    """
    Train an XGBoost classifier on the Pima Indians diabetes CSV and export it.

    Every column except the last is used as a feature and the last column is
    the label. 20% of the rows are held out (fixed seed) and the accuracy on
    that hold-out is printed. Two artifacts are written to the current working
    directory: ``pima-indians.model`` (native XGBoost format) and
    ``pima-indians.pmml`` (PMML export of the same classifier).
    """
    # 1. Load the dataset
    data = pd.read_csv('../Datasets/pima-indians-diabetes.data.csv')
    # 2. Prepare the data: features are all columns but the last, label is the last
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    # test_size=0.2: 20% of the rows are used for testing, 80% for training
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 3. Train the model.
    # The classifier is fitted *through* the PMMLPipeline rather than on its
    # own: sklearn2pmml documents this as the way the pipeline captures the
    # training frame's feature/target column names for the PMML export.
    # Wrapping an already-fitted estimator in an unfitted pipeline skips that.
    model = xgb.XGBClassifier()
    pipeline = PMMLPipeline([
        ("classifier", model)
    ])
    pipeline.fit(X_train, y_train)

    # 4. Predict on the hold-out rows and evaluate.
    # Pipeline.fit fits the final step in place, so `model` is the trained
    # classifier here.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("准确率：", accuracy)
    model.save_model('pima-indians.model')

    # Convert to PMML format
    sklearn2pmml(pipeline, "pima-indians.pmml", with_repr=True)


def model_test():
    """
    Reload the saved native XGBoost model and print its accuracy.

    The model is read from ``pima-indians.model`` and scored on a 50% hold-out
    of the diabetes CSV (fixed seed 42).
    """
    # Restore the classifier from the native XGBoost file.
    classifier = xgb.XGBClassifier()
    classifier.load_model('pima-indians.model')

    # Features are every column but the last; the last column is the label.
    frame = pd.read_csv('../Datasets/pima-indians-diabetes.data.csv')
    features, labels = frame.iloc[:, :-1], frame.iloc[:, -1]

    # Only the test half of the split is needed here.
    # NOTE(review): this 50% hold-out is not the 80/20 split model_train uses,
    # so it likely contains rows the model was trained on — treat the score as
    # a consistency check rather than an unbiased estimate; confirm intent.
    _, X_test, _, y_test = train_test_split(
        features, labels, test_size=0.5, random_state=42
    )

    print("XGB 准确率：", accuracy_score(y_test, classifier.predict(X_test)))


def model_pmml_test():
    """
    Load the exported PMML file and print its accuracy.

    The diabetes CSV is split 50/50 (fixed seed 42) and the test half is
    scored with the model read from ``pima-indians.pmml``.
    """
    # Features are every column but the last; the last column is the label.
    frame = pd.read_csv('../Datasets/pima-indians-diabetes.data.csv')
    features, labels = frame.iloc[:, :-1], frame.iloc[:, -1]

    # Only the test half of the split is needed here.
    _, X_test, _, y_test = train_test_split(
        features, labels, test_size=0.5, random_state=42
    )

    # NOTE(review): `fromFile` does not appear in the PMMLPipeline API as
    # documented by sklearn2pmml (it resembles pypmml's `Model.fromFile`) —
    # confirm this call works with the installed version.
    pmml_model = PMMLPipeline.fromFile("pima-indians.pmml")
    predictions = pmml_model.predict(X_test)

    print("PMML 准确率：", accuracy_score(y_test, predictions))


if __name__ == '__main__':
    # Run the stages in order: train and export first, then score the native
    # model file and the PMML file that training produced.
    for stage in (model_train, model_test, model_pmml_test):
        stage()
